import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('darkgrid')
data = pd.read_csv('Data/data.csv')
# import graph objects as "go"
# pip install plotly
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot, plot
data.describe()
data.columns
# Convert game duration from seconds to minutes for a readable x-axis.
duration = data['gameDuration']/60 # convert second to minute.
plt.figure(figsize=(10,5))
# Fix: plot the `duration` variable computed above instead of silently
# recomputing the same expression (the variable was previously unused).
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# sns.histplot(duration) is the modern equivalent if the environment allows.
sns.distplot(duration, hist=True, kde=False)
plt.xlabel('Minutes')
plt.ylabel('Number of games')
plt.show()
Note: most matches last around 10 to 15 minutes.
# Share of games won by each side, shown as a pie chart
# (plt.pie normalizes the two win rates to fractions of the whole).
win_rates = [data['blueWin'].mean(), data['redWin'].mean()]
side_labels = ['Blue', 'Red']
side_colors = ['lightskyblue', 'lightcoral']
plt.figure(figsize=(10,5))
plt.pie(win_rates, autopct='%1.1f%%', startangle=140, colors=side_colors, labels=side_labels)
plt.show()
# Columns that belong to the blue team.
bluefeats = data.columns[data.columns.str.contains('blue')]
bluefeats
blue_df = data.loc[:, bluefeats]
blue_df.head()
blue_df.shape
Both teams share the same set of attributes (features), so we examine one side to understand the data.
# Columns that belong to the red team.
redfeats = data.columns[data.columns.str.contains('red')]
redfeats
red_df = data.loc[:, redfeats]
red_df.head()
red_df.shape
# Summary statistics for each side.
blue_df.describe()
red_df.describe()
# Correlation of every numeric column with each side's win flag.
corr_matrix = data.corr()  # compute the full matrix once instead of twice
blue_corr = corr_matrix['blueWin'].sort_values(ascending=False)
red_corr = corr_matrix['redWin'].sort_values(ascending=False)
print(red_corr.head())
# Columns whose absolute correlation with a blue win exceeds 0.5.
# .items() replaces .iteritems(), which was removed in pandas 2.0.
corr_columns = [feat for feat, corr in blue_corr.items() if abs(corr) > 0.5]
plt.figure(figsize=(10,10))
sns.set(font_scale = 1)
sns.heatmap(data[corr_columns].corr(), annot=True)
plt.figure(figsize=(4,7))
# Red-team features ranked by their correlation with a red win.
# .items() replaces .iteritems(), removed in pandas 2.0; iterate red_corr
# directly rather than blue_corr (both share the same index, but this
# matches the comment's intent and reads clearly).
red_corra = [feat for feat, corr in red_corr.items() if 'red' in feat]
red_corra = red_corr[red_corra].sort_values(ascending=False).to_frame()
sns.heatmap(red_corra, annot=True)
plt.figure(figsize=(4,7))
# Blue-team features ranked by their correlation with a blue win.
# .items() replaces .iteritems(), which was removed in pandas 2.0.
blue_corra = [feat for feat, corr in blue_corr.items() if 'blue' in feat]
blue_corra = blue_corr[blue_corra].sort_values(ascending=False).to_frame()
# Bug fix: the original plotted red_corra here instead of blue_corra,
# so the "blue" heatmap showed the red team's correlations.
sns.heatmap(blue_corra, annot=True, cmap='Blues')
Comments: TODO — add an interpretation of the correlation heatmaps above.
# Per-feature histograms for each side, one grid of subplots per team.
for team_df, team_color in ((blue_df, '#084177'), (red_df, '#d63447')):
    team_df.hist(bins=50, color=team_color, figsize=(20,15))
    plt.show()
# Interactive comparison of wards destroyed by each side.
wards_fig = go.Figure()
wards_fig.add_trace(go.Histogram(x=blue_df['blueWardskilled'], name='Blue Win'))
wards_fig.add_trace(go.Histogram(x=red_df['redWardskilled'], name='Red Win'))
wards_fig.update_layout(title='Wards Destroyed Distribution', height=800, width=800)
iplot(wards_fig)
Comments: TODO — add an interpretation of the wards-destroyed distribution above.
# Gold earned by each side, overlaid with partial opacity on the red trace.
gold_fig = go.Figure()
gold_fig.add_trace(go.Histogram(x=blue_df['blueGoldearned'], name='Blue Win', opacity=1))
gold_fig.add_trace(go.Histogram(x=red_df['redGoldearned'], name='Red Win', opacity=0.5))
gold_fig.update_layout(title='Gold Earned Distribution 2 Teams', height=800, width=800)
iplot(gold_fig)
# Static (matplotlib) version of the gold-earned comparison.
plt.figure(figsize=(12,6))
for gold_series, hist_color in ((blue_df['blueGoldearned'], 'blue'),
                                (red_df['redGoldearned'], 'red')):
    gold_series.hist(color=hist_color, alpha=0.5, bins=20)
plt.title("Gold Earned 2 Teams")
# Visual check for NaN values in the blue-team features: any missing entry
# would appear as a contrasting stripe (no imputation is performed here).
sns.heatmap(blue_df.isnull(),yticklabels=False,cbar=False,cmap='viridis')
# Same missing-value check for the red-team features.
sns.heatmap(red_df.isnull(),yticklabels=False,cbar=False,cmap='viridis')
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
# Use every column except the identifiers/targets as model features.
features = data.drop(['blueWin','redWin','gameId'],axis=1)
y = data['redWin']
# Bug fix (data leakage): split BEFORE scaling so the scaler is fit on the
# training data only — fitting on the full dataset leaks test-set statistics.
X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns=features.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=features.columns, index=X_test.index)
# Full scaled frame kept because later cells read `df_features` / `X`.
df_features = pd.DataFrame(scaler.transform(features), columns=features.columns)
X = df_features
# checking shapes of each
print("X_train shape: ", X_train.shape)
print("y_train shape: ", y_train.shape)
print("X_test shape: ", X_test.shape)
print("y_test.shape: ", y_test.shape)
sns.scatterplot(data=red_df, x='redGoldearned', y='redKills', hue='redWin')
# NOTE(review): the target is binary, so ordinary least squares acts as a
# linear-probability model here; MSE/R^2 are reported for reference.
linear_reg = linear_model.LinearRegression()
linear_reg.fit(X_train, y_train)
print("y = x *", linear_reg.coef_, "+", linear_reg.intercept_)
train_predicted = linear_reg.predict(X_train)
test_predicted = linear_reg.predict(X_test)
#MSE — use the already-imported sklearn helper instead of re-deriving it by hand.
mse_train = mean_squared_error(y_train, train_predicted)
mse_test = mean_squared_error(y_test, test_predicted)
#R2
r2_train = r2_score(y_train, train_predicted)
r2_test = r2_score(y_test, test_predicted)
print("MSE of Training set:", mse_train)
print("MSE of Testing set:", mse_test)
print("R Squared of Training set:", r2_train)
print("R Squared of Testing set:", r2_test)
Comments: this linear model is a good fit. It suggests that gold earned is a strong predictor of which side wins.
feature_cols = data.columns.drop(['gameId','blueWin','redWin'])
feature_cols
estimator = LogisticRegression(max_iter=1000,C=1000, random_state=0)
# Fix: n_features_to_select is keyword-only in modern scikit-learn, so the
# positional form RFE(estimator, 9) raises a TypeError there.
selector = RFE(estimator, n_features_to_select=9) # select 9 features for us
selector = selector.fit(X_train, y_train)
supp = selector.get_support()
print("Selected features:", feature_cols[supp])
# Keep only the nine features chosen by RFE.
feature_cols = ['blueGoldearned', 'blueChamplevel', 'blueNeutralminionskilled',
                'redGoldearned', 'redChamplevel', 'redNeutralminionskilled',
                'blueTowerkills', 'redTowerkills', 'redBaronkills']
#Scale the data
features = data[feature_cols]
y = data['redWin']
# Bug fix (data leakage): split BEFORE scaling so the scaler is fit on the
# training data only — fitting on the full dataset leaks test-set statistics.
X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.2, random_state=42)
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns=features.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=features.columns, index=X_test.index)
# Full scaled frame kept because later cells read `df_features` / `X`.
df_features = pd.DataFrame(scaler.transform(features), columns=features.columns)
X = df_features
# checking shapes of each
print("X_train shape: ", X_train.shape)
print("y_train shape: ", y_train.shape)
print("X_test shape: ", X_test.shape)
print("y_test.shape: ", y_test.shape)
Fine-tuning the logistic-regression model by changing the C (regularization) value.
# Train a regularized logistic regression (C=0.1) and evaluate it on
# both splits: accuracy first, then the corresponding confusion matrices.
lr = LogisticRegression(max_iter=200, C=0.1, random_state=0)
lr.fit(X_train, y_train)
y_pred_train = lr.predict(X_train)
y_pred_test = lr.predict(X_test)
print("Acc on training set: ", accuracy_score(y_train, y_pred_train))
print("Acc on test set: ", accuracy_score(y_test, y_pred_test))
print("Confusion matrix of training model")
print(confusion_matrix(y_train, y_pred_train))
print("Confusion matrix of testing model")
print(confusion_matrix(y_test, y_pred_test))
Comments: the logistic regression overfits because the model is very complex and has many features relative to the number of observations. Therefore, we do not use this model to predict the winning outcome.
Using TensorFlow (version 2.2.0).
import tensorflow as tf
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from scipy.stats import ttest_ind
import warnings
warnings.filterwarnings("ignore")
df_features.columns
# One numeric TF feature column per model feature.
feat_cols = [tf.feature_column.numeric_column(name) for name in df_features.columns]
feat_cols
# Input pipeline: feeds the pandas training frame to the estimator in large
# shuffled batches; num_epochs=None repeats indefinitely, so training length
# is controlled by the `steps` argument of classifier.train below.
input_func = tf.compat.v1.estimator.inputs.pandas_input_fn(x=X_train, y=y_train, batch_size=10000, num_epochs=None, shuffle=True)
# DNN with three hidden layers of 100 units; binary output (n_classes=2).
classifier = tf.estimator.DNNClassifier(hidden_units=[100, 100, 100], n_classes=2, feature_columns=feat_cols)
Comments: by default, DNNClassifier uses the ReLU activation function.
# Run 500 optimization steps over the shuffled training pipeline.
classifier.train(input_fn=input_func, steps=500)
# Single non-shuffled pass over the whole test set for prediction.
prediction_fn = tf.compat.v1.estimator.inputs.pandas_input_fn(x=X_test, batch_size=len(X_test), shuffle=False)
predictions = list(classifier.predict(input_fn=prediction_fn))
predictions[0]
# Each prediction dict carries 'class_ids'; collect the predicted class ids.
final_preds = [pred['class_ids'][0] for pred in predictions]
print("Confusion matrix of testing model")
cm = confusion_matrix(y_test, final_preds)
print(cm)
# Bug fix: plot the confusion matrix just computed instead of a hard-coded
# copy of one earlier run's numbers, which goes stale on every re-run.
df_cm = pd.DataFrame(cm, range(2), range(2))
# plt.figure(figsize=(10,7))
sns.set(font_scale=1.4) # for label size
sns.heatmap(df_cm, annot=True, annot_kws={"size": 16}) # font size
plt.show()
print(classification_report(y_test,final_preds))
accuracy_NN = accuracy_score(y_test, final_preds)
print(accuracy_NN)
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
# Training a regression tree with max_depth=4.
regressor = DecisionTreeRegressor(random_state=42, max_depth=4)
regressor.fit(X_train, y_train)
# Visualize the tree structure.
fig, ax = plt.subplots(figsize=(20, 10))
tree.plot_tree(regressor, filled=True, fontsize=10)
plt.show()
# Evaluate the model with the testing data
y_pred = regressor.predict(X_test)
# Bug fix: r2_score expects (y_true, y_pred) and is NOT symmetric — the
# original call r2_score(y_pred, y_test) reported a wrong score.
score = r2_score(y_test, y_pred)
print("\nThe r2 score is: %.4f\n" % score)
# MSE is symmetric, but pass (y_true, y_pred) for consistency anyway.
mse_test = mean_squared_error(y_test, y_pred)
mse_train = mean_squared_error(y_train, regressor.predict(X_train))
print("The mean squared error (testing) is: %.4f\n" % mse_test)
print("The mean squared error (training) is: %.4f\n" % mse_train)
Conclusion: TODO — summarize the findings and compare the performance of the models above.